List of used packages:
library(limma)
library(readxl)
library(readr)
library(impute)
library(ggbiplot)
library(reactable)
library(heatmaply)
library(EnhancedVolcano)
library(ggVennDiagram)
We had two softs for mass-spectrometry results analysis: Peaks Xpro and MaxQuant. Firstly, we compared identified proteins using these two softs.
# All data may be found in "Data" directory
maxq <- read_xlsx("raw_proteomics_MaxQuant_filtered.xlsx")
peaks <- read_csv("data_Peaks.csv")
id_list <- list(peaks$Gene_id, maxq$gene_id)
ggVennDiagram(id_list, category.names = c("Peaks", "MaxQuant"), label_alpha = 0) +
scale_fill_gradient(low = "palegreen3", high = "#0a9278") +
labs(title = "Protein identification") +
theme(plot.title = element_text(face = "bold", hjust = 0.5))
We chose data, obtained by using Peaks software as it identified much more proteins
# Read raw data obtained from "Peaks" software
data <- read.csv("data_Peaks.csv", stringsAsFactors = F)
# Make tidy data
data$Accession_id <- unlist(lapply(data$Accession, function(x) unlist(strsplit(x, "\\|")[1])[1]))
data$temp <- unlist(lapply(data$Accession, function(x) unlist(strsplit(x, "\\|")[1])[2]))
data$Protein_name <- unlist(lapply(data$temp, function(x) unlist(strsplit(x, "_")[1])[1]))
data <- data[, -111]
# Read factor table
data_factors <- read_xlsx("factor_Peaks.xlsx")
data_factors$Health <- as.factor(data_factors$Health)
data_factors$Differentiation <- as.factor(data_factors$Differentiation)
data_factors$Series <- as.factor(data_factors$Series)
colnames(data)[39:71] <- data_factors$sample
reactable(data[, c(111, 39:71)])
reactable(data_factors)
Proteomics data may possess NA values which should be treated carefully. Here we tried two approaches:
t_data <- t(data[, 39:71])
colnames(t_data) <- data$Protein_name
## Remove columns where the number of NA is greater than half of observations
cond_1 <- colSums(is.na(t_data)) <= nrow(t_data) / 2
t_data_mindrop <- t_data[, cond_1]
sum(colSums(is.na(t_data)) >= nrow(t_data) / 2)
## [1] 1316
## Remove columns where the number of NA is greater than 2
cond_2 <- colSums(is.na(t_data)) < 2
t_data_maxdrop <- t_data[, cond_2]
sum(colSums(is.na(t_data)) > 2)
## [1] 2344
## Imputation of NA
data_maxdrop <- t(impute.knn(t_data_maxdrop, k = 10)$data)
sum(is.na(data_maxdrop))
## [1] 0
data_mindrop <- t(impute.knn(t_data_mindrop, k = 10)$data)
sum(is.na(data_mindrop))
## [1] 0
Application of MAX drop approach resulted in considerable reduction of proteins number (approximately two-thirds of all proteins), whereas usage MIN drop led to more slight decrease in protein number (around one third of all proteins).
Log2 transformation and quantile transformation were used as it had demonstrated the best normalization results.
# MAX DROP (Drop NA if > 2)
data_norm_log_max <- log2(data_maxdrop + 1)
data_norm_quantile_max <- normalizeQuantiles(data_norm_log_max)
boxplot(data_norm_quantile_max,col = "#0a9278",
border = "black",
main = "Log2 + Quantile normalization",
ylab = "intensities",
xlab = "samples")
# MIN DROP (Drop NA if > half of observations)
data_norm_log_min <- log2(data_mindrop + 1)
data_norm_quantile_min <- normalizeQuantiles(data_norm_log_min)
boxplot(data_norm_quantile_min, col = "#0a9278",
border = "black",
main = "Log2 + Quantile normalization",
ylab = "intensities",
xlab = "samples")
We used PCA to decide which data would be analyzed: MAX drop or MIN drop.
# Year
data_pca_maxdrop <- prcomp(t(data_norm_quantile_max), center = T, scale. = F)
ggbiplot(data_pca_maxdrop, ellipse = TRUE, groups = data_factors$Series, labels = NULL, var.axes = FALSE, alpha = 0.7) +
scale_color_manual(values = c("#0a9278","#f57002")) +
labs(title = "Batch effect in data MAX drop", color = "Year") +
theme(plot.title = element_text(face = "bold", hjust = 0.5))
# Health
ggbiplot(data_pca_maxdrop, ellipse = TRUE, groups = data_factors$Health, labels = NULL, var.axes = FALSE, alpha = 0.7) +
scale_color_manual(values = c("#0a9278","#f57002")) +
labs(title = "Data classes (Control vs Health)", color = "Group") +
theme(plot.title = element_text(face = "bold", hjust = 0.5))
# Differentiation
ggbiplot(data_pca_maxdrop, ellipse = TRUE, groups = data_factors$Differentiation, labels = NULL, var.axes = FALSE, alpha = 0.7) +
geom_point(aes(col = data_factors$Differentiation)) +
scale_color_manual(values = c("#0a9278","#f57002")) +
labs(title = "Data classes (Control vs Differentiation)", color = "Group") +
theme(plot.title = element_text(face = "bold", hjust = 0.5))
# Year
data_pca_mindrop <- prcomp(t(data_norm_quantile_min), center = T, scale. = F)
ggbiplot(data_pca_mindrop, ellipse = TRUE, groups = data_factors$Series, labels = NULL, var.axes = FALSE, alpha = 0.7) +
scale_color_manual(values = c("#0a9278","#f57002")) +
labs(title = "Batch effect in data (MIN drop)", color = "Year") +
theme(plot.title = element_text(face = "bold", hjust = 0.5))
# Health
ggbiplot(data_pca_mindrop, ellipse = TRUE, groups = data_factors$Health, labels = NULL, var.axes = FALSE, alpha = 0.7) +
scale_color_manual(values = c("#0a9278","#f57002")) +
labs(title = "Data classes (Control vs Health)", color = "Group") +
theme(plot.title = element_text(face = "bold", hjust = 0.5))
# Differentiation
ggbiplot(data_pca_mindrop, ellipse = TRUE, groups = data_factors$Differentiation, labels = NULL, var.axes = FALSE, alpha = 0.7) +
geom_point(aes(col = data_factors$Differentiation)) +
scale_color_manual(values = c("#0a9278","#f57002")) +
labs(title = "Data classes (Control vs Differentiation)", color = "Group") +
theme(plot.title = element_text(face = "bold", hjust = 0.5))
We chose “MAX drop” data for further analysis, because Control and Differentiation classes were more separated from each other than in case of “MIN drop” data.
We have performed differential expression analysis depending on year of experiment in order to somehow estimate the scale of batch effect.
X <- model.matrix(~ Series, data = data_factors)
# Build a linear model for each protein
fit <- lmFit(data_norm_quantile_max, design = X, method = "robust", maxit = 1000)
# Empirical Bayes statistics
efit <- eBayes(fit)
topTable(efit, coef = 2)
## logFC AveExpr t P.Value adj.P.Val B
## CYTB -3.673090 24.16232 -28.29261 4.146187e-25 5.021033e-22 46.86073
## PFD3 3.652494 21.58231 19.10300 1.106861e-19 6.702045e-17 34.85261
## S10A6 -4.305366 28.53789 -17.60786 1.360747e-18 5.492882e-16 32.37892
## DX39B 1.802131 24.41763 15.43026 7.280926e-17 1.821569e-14 28.42657
## NF1 -4.427363 24.00246 -15.41342 7.520928e-17 1.821569e-14 28.39367
## SC61G 1.659376 23.77067 15.19854 1.140337e-16 2.301580e-14 27.98333
## CDC42 1.734035 25.25963 15.06821 1.470974e-16 2.544784e-14 27.72482
## RLA1 5.630592 20.95293 14.75319 2.740664e-16 4.148680e-14 27.11742
## TMOD3 1.557038 24.26026 14.24639 7.615708e-16 1.024736e-13 26.09511
## PP14B 2.875091 22.86225 13.81546 1.854394e-15 2.245671e-13 25.19693
num_spots <- nrow(data_norm_quantile_max)
full_list <- topTable(efit, coef = 2, number = num_spots,
sort.by = "none")
p_above <- full_list$adj.P.Val <= 0.05
dif_data_year_wo_correction <- data_norm_quantile_max[p_above, ]
sum(p_above)
## [1] 707
As you may see, there are 707 “differentially expressed” proteins depending on year of experiment.
Draw heatmap:
heatmaply(dif_data_year_wo_correction, main = "1 Year vs 2 Year", fontsize_row = 1, dendrogram = "col", scale_fill_gradient_fun = ggplot2::scale_fill_gradient2(low = "lightseagreen", high = "orangered3", midpoint = 15))
Draw Volcano plot:
EnhancedVolcano(full_list,
lab = rownames(full_list),
title = "Batch effect connecting with year\nof experiment",
subtitle = NULL,
x = "logFC",
y = "adj.P.Val",
pCutoff = 0.05,
FCcutoff = 0.1,
legend = c("Not significant","Log2FC","Padj","Padj & Log2FC"),
legendPosition = "right",
col = c("lightcyan4","#f57002", "#ee9f02", "#0a9278"))